pacman::p_load("patchwork", "tmap", "ExPanDaR", "kableExtra", "ggstatsplot", "tidyverse")EDA: Overview of Suicide Rate in a Particular Year
Step-by-Step Data Preparation
1. Installing and launching required R packages
2. Loading the data
suicidedata_eda_formap <- read_csv("data/suicidedata_eda_formap.csv", show_col_types = FALSE)suicidedata_eda <- read_csv("data/suicidedata_eda.csv", show_col_types = FALSE)3. Dataset overview
We will check the suicidedata_eda as suicidedata_eda_formap is meant to be combined with the sf file (world)
3.1 Missing Values
missing.values <- suicidedata_eda %>%
gather(key = "key", value = "val") %>%
mutate(is.missing = is.na(val)) %>%
group_by(key, is.missing) %>%
summarise(num.missing = n()) %>%
filter(is.missing==T) %>%
select(-is.missing) %>%
arrange(desc(num.missing))
missing.values %>% kable()| key | num.missing |
|---|---|
| DN | 18360 |
| DR | 18360 |
| POP | 18360 |
| SN | 18360 |
| SP | 18360 |
Visualising Missing Values
missing_values <- function(vari = "age_name"){
prepare_missing_values_graph(suicidedata_eda, ts_id = vari)
}By country
missing_values("country")
By gender
missing_values("sex_name")
By age group
missing_values("age_name")
By year
missing_values("year")
Conclusion : The missing values are because of age-standardised (only for Suicide Rates)
3.2 Descriptive statistics
descr_stats <- function(year, gender = "Both", age = "All ages") {
descr <- prepare_descriptive_table(suicidedata_eda %>%
filter(year == year,
sex_name == gender,
age_name == age) %>%
select(!c(6,8)) %>%
rename("Number of suicide" = "SN",
"Number of deaths" = "DN",
"Share of deaths from suicide (%)" = "SP",
"Suicide rate" = "SR",
"Mortality rate" = "DR"))
descr$kable_ret %>%
kable_styling("condensed", full_width = F, position = "center")
}descr_stats(2010, "Both", "All ages")| N | Mean | Std. dev. | Min. | 25 % | Median | 75 % | Max. | |
|---|---|---|---|---|---|---|---|---|
| Number of suicide | 6,120 | 3,874.828 | 18,425.617 | 0.126 | 93.934 | 532.835 | 1,882.305 | 220,356.720 |
| Number of deaths | 6,120 | 252,265.821 | 920,824.636 | 10.886 | 7,358.955 | 51,487.069 | 169,570.300 | 10,659,491.040 |
| Share of deaths from suicide (%) | 6,120 | 1.394 | 1.067 | 0.105 | 0.691 | 1.118 | 1.787 | 11.812 |
| Suicide rate | 6,120 | 11.339 | 9.395 | 1.427 | 5.424 | 8.224 | 14.601 | 95.565 |
| Mortality rate | 6,120 | 847.052 | 354.531 | 148.258 | 611.956 | 783.986 | 1,019.318 | 10,705.808 |
3.3 Distribution
3.3.1 Histogram of multiple variables
plot_multi_distribution <- function(year, gender = "Both", age = "All ages") {
suicidedata_hist <- suicidedata_eda %>%
filter(year == year,
sex_name == gender,
age_name == age) %>%
select(c(7,9,10,11,12)) %>%
rename("Number of suicide" = "SN",
"Number of deaths" = "DN",
"Share of deaths from suicide (%)" = "SP",
"Suicide rate" = "SR",
"Mortality rate" = "DR")
suicidedata_hist %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap( ~key, ncol=3, scales="free") +
geom_histogram()
}plot_multi_distribution(2010, "Both", "All ages")
3.3.2 Histogram of individual variables
plot_distribution <- function(year, gender = "Both", age = "All ages") {
suicidedata_hist <- suicidedata_eda %>%
filter(year == year,
sex_name == gender,
age_name == age)
#computing summary statistics of mean, median and lower and upper whiskers in boxplot
var_mean <- round(mean(suicidedata_hist$SR))
var_median <- round(median(suicidedata_hist$SR))
ymax <- as.numeric(round((IQR(suicidedata_hist$SR)*1.5) +
quantile(suicidedata_hist$SR,0.75)))
ymin <- as.integer(min(suicidedata_hist$SR))
#plotting histogram
h <- suicidedata_hist %>%
ggplot(aes(x = SR)) +
geom_histogram(color="black", fill="azure4") +
scale_x_continuous(labels = scales::comma) +
labs(x = paste0("Suicide Rate, ",gender,", ",age,", ", year), y = "Count") +
geom_vline(aes(xintercept = var_mean), col="darkblue", linewidth=1, linetype = "dashed") +
annotate("text", x=17, y=1750, label="Mean:",
size=4, color="darkblue") +
annotate("text", x=17, y=1650, label=format(var_mean, big.mark = ","),
size=4, color="darkblue") +
geom_vline(aes(xintercept = var_median), col="lightpink4", linewidth=1, linetype = "dashed") +
annotate("text", x=0, y=1750, label="Median:",
size=4, color="lightpink4") +
annotate("text", x=0, y=1650, label=format(var_median, big.mark = ","),
size=4, color="lightpink4") +
theme(axis.text.x = element_text(size=8))
#plotting boxplot
b <- suicidedata_hist %>%
ggplot(aes(y = SR)) +
geom_boxplot(outlier.colour="firebrick", outlier.shape=16,
outlier.size=1, notch=FALSE) +
coord_flip() + labs(y = "", x = "") +
scale_y_continuous(labels = scales::comma) +
theme(axis.text = element_blank(), axis.ticks = element_blank()) +
stat_boxplot(geom="errorbar", width=0.5) +
annotate("text", x=0.35, y=ymax, label=format(ymax, big.mark = ","),
size=3, color="lightpink4") +
annotate("text", x=0.35, y=ymin, label=format(ymin, big.mark = ","),
size=3, color="lightpink4")
#combining plots
suicide_distri <- b / h + plot_layout(heights = c(1, 4))
suicide_distri
}plot_distribution(2010, "Both", "All ages")
4. Plotting the choropleth map
4.1 Joining with world map (tmap object)
The object World is a spatial object of class sf from the sf package; it is a data.frame with a special column that contains a geometry for each row, in this case polygons
Reference - https://cran.r-project.org/web/packages/tmap/vignettes/tmap-getstarted.html
data("World")Joining the two dataframes together
suicidedata_eda_map <- left_join(World,
suicidedata_eda_formap %>% mutate(across(where(is.numeric), round, 2)),
by = c("iso_a3" = "code")) %>%
select(!c(2,3,4,6,7,8,9,10,11,12,13,14,15)) %>%
mutate(area = as.numeric(str_remove(`area`,
" \\[km\\^2\\]")),
.after = region) %>%
na.omit()Creating a function to plot
plot_map_eda <- function(year, metric = "SR", gender = "T", style = "jenks"){
metric_text = case_when(metric == "SR" ~ "Suicide rate",
metric == "SP" ~ "Share of deaths from suicide (%)",
metric == "SN" ~ "Number of suicide",
metric == "DN" ~ "Number of deaths",
metric == "DR" ~ "Mortality rate")
age = case_when(metric == "SR" ~ "AS",
metric == "SP" ~ "All",
metric == "SN" ~ "All",
metric == "DN" ~ "All",
metric == "DR" ~ "All")
gender_text = case_when(gender == "T" ~ "Total",
gender == "M" ~ "Male",
gender == "F" ~ "Female")
tmap_mode("view")
tm_shape(suicidedata_eda_map |>
filter(year == year))+
tm_fill(paste0(metric,"_",age,"_",gender),
style = style,
palette="YlOrBr",
id = "country",
title = paste0(metric_text, ", ",gender_text,", ", year),
popup.vars = c(value = paste0(metric,"_",age,"_",gender))) +
tm_borders(col = "grey20",
alpha = 0.5)
}plot_map_eda(2016, "SR", "M", "jenks")